package com.sentiment.crawler;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.nodes.Element;

import com.sentiment.parser.news.ChinanewsNews;
import com.sentiment.parser.news.IfengNews;
import com.sentiment.parser.news.PeopleNews;
import com.sentiment.parser.news.QqNews;
import com.sentiment.parser.news.SinaNews;
import com.sentiment.parser.news.SohuNews;
import com.sentiment.parser.news.WangyiNews;
/**
 * Helper utilities used while crawling news pages.
 *
 * @author 王骏科
 */
public class CrawlerUtils {
	/**
	 * Resolves the display name of the publishing site for a given URL.
	 * <p>
	 * The URL is tested against each known site's pattern in a fixed order and
	 * the first match wins. {@link String#matches(String)} requires the whole
	 * URL to match the pattern, not just a substring of it.
	 *
	 * @param url the URL of the crawled page; may be {@code null}
	 * @return the publisher's display name, or {@code "未录入统计"} when the URL
	 *         is {@code null} or matches none of the known sites
	 */
	public static String publisherFilter(String url) {
		// A missing URL cannot be attributed to any site; report it as untracked
		// instead of failing with a NullPointerException.
		if (url == null) {
			return "未录入统计";
		}

		// Sites that have a dedicated parser expose their own URL pattern.
		if (url.matches(WangyiNews.REGEX)) {
			return "网易新闻";
		}
		if (url.matches(SohuNews.REGEX)) {
			return "搜狐新闻";
		}
		if (url.matches(SinaNews.REGEX)) {
			return "新浪新闻";
		}
		if (url.matches(IfengNews.REGEX)) {
			return "凤凰网";
		}
		if (url.matches(ChinanewsNews.REGEX)) {
			return "中国新闻网";
		}
		if (url.matches(PeopleNews.REGEX)) {
			return "人民网";
		}
		if (url.matches(QqNews.REGEX)) {
			return "腾讯网";
		}

		// Remaining sites are recognised only by a ".name." fragment in the URL.
		if (url.matches(".*\\.cctv\\..*")) {
			return "央视网";
		}
		if (url.matches(".*\\.gmw\\..*")) {
			return "光明网";
		}
		if (url.matches(".*\\.xinhuanet\\..*")) {
			return "新华网";
		}
		if (url.matches(".*\\.qianlong\\..*")) {
			return "千龙网";
		}
		if (url.matches(".*\\.southcn\\..*")) {
			return "南方网";
		}
		return "未录入统计";
	}

	/**
	 * Extracts the body text from the given element, split into paragraphs.
	 * <p>
	 * Each direct child of {@code e} contributes one entry, taken from that
	 * child's text with every whitespace character and every non-breaking
	 * space (U+00A0) removed. Children whose text is empty after that
	 * stripping are skipped.
	 *
	 * @param e the element holding the article body, as obtained from a News
	 *          object; may be {@code null}
	 * @return the non-empty paragraphs in document order; an empty list when
	 *         {@code e} is {@code null} or yields no text. Never {@code null}.
	 */
	public static List<String> crawlText(Element e) {
		List<String> text = new ArrayList<String>();
		if (e == null) {
			return text;
		}

		for (Element ele : e.children()) {
			String addText = ele.text().replaceAll("[\\u00a0\\s]", "");
			// Compare by content: a reference comparison against "" would let
			// empty paragraphs through, because replaceAll can return a distinct
			// String instance that is merely equal to "".
			if (!addText.isEmpty()) {
				text.add(addText);
			}
		}
		return text;
	}
}
